from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: two pre-tokenized (space-separated) Chinese documents.
X = ['我 爱 你','我 恨 你 恨 你']
y = [0, 1]  # labels (not used below; presumably for a later classification step)

# Term-frequency (bag-of-words) model.
# Token pattern: runs of ASCII letters or CJK Unified Ideographs (U+4E00-U+9FA5).
# NOTE: the original pattern had a stray '|' inside the character class — within
# [...] that is a literal pipe character, not alternation, so it wrongly admitted
# '|' as a token character. Removed, and the regex is now a raw string.
tfCoder = CountVectorizer(token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")
X1 = tfCoder.fit_transform(X)  # fit the vocabulary and build the count matrix
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(tfCoder.get_feature_names_out())
print(X1.toarray())

# TF-IDF model. norm=None disables row normalization so the raw tf*idf
# values are printed, which makes them easy to verify by hand (see below).
tiCoder = TfidfVectorizer(norm=None, token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")
X2 = tiCoder.fit_transform(X)
print(tiCoder.get_feature_names_out())
print(X2.toarray())

import numpy as np

# Hand-check the IDF that TfidfVectorizer assigns to a term appearing in
# 1 of the 2 documents. With the default smooth_idf=True, scikit-learn uses
#     idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# so for n_docs=2, df=1 this is ln(3/2) + 1 — the value seen in X2 above.
n_docs = 2  # number of documents in the corpus
df = 1      # number of documents containing the term
idf = np.log((1 + n_docs) / (1 + df)) + 1
print(idf)